Kapitel 6.9: Informationsvermittlung¶
Das Notebook ergänzt Kapitel 6.9 'Informationsvermittlung'.
Import¶
In [1]:
import pandas as pd
import numpy as np
from resources_statistics import *
from resources_geschichtslyrik import *
import plotly.express as px
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator
from tqdm.notebook import tqdm
In [2]:
meta = pd.read_json(r"../resources/meta.json")
Merkmale hinzufügen¶
In [3]:
meta['wissen_behandelt'] = [1 if pd.notna(x) else 0 for x in meta['wissen']]
meta['count'] = meta.query("corpus=='anth'").groupby('author_title')['author_title'].transform('count')
Korpora¶
Korpora erstellen¶
In [4]:
meta_anth = (
meta
.query("corpus=='anth'")
.query("1850 <= year <= 1918")
.query("geschichtslyrik == 1")
.drop_duplicates(subset='author_title')
)
meta_anth_bin = binarize_meta(meta_anth)
In [5]:
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']
meta_modcanon = (
meta
.query("author in @modcanon_authors")
.query("1850 <= year <= 1918")
.query("geschichtslyrik == 1")
.drop_duplicates(subset='author_title')
)
In [6]:
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']
meta_muench = (
meta
.query("author in @muench_authors")
.query("1850 <= year <= 1918")
.query("geschichtslyrik == 1")
.drop_duplicates(subset='author_title')
)
In [7]:
sub_df = pd.DataFrame()
sub_names = ['Anthologien', 'Kanonisierte Moderne', 'Münchhausen-Kreis']
sub_metas = [meta_anth, meta_modcanon, meta_muench]
Merkmale berechnen¶
In [8]:
for this_name, this_meta in zip(sub_names, sub_metas):
sub_df.loc[this_name, 'Jahr'] = round(this_meta['year'].mean(), 0)
sub_df.loc[this_name, 'Texte'] = this_meta.shape[0]
sub_df.loc[this_name, 'wissen_neutral'] = this_meta.query("wissen.isna()").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'wissen_positiv'] = this_meta.query("wissen == 1").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'wissen_negativ'] = this_meta.query("wissen == -1").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'wissen_ambivalent'] = this_meta.query("wissen == 0").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'uebereinstimmend'] = this_meta.query("verhaeltnis_wissen == 'übereinstimmend'").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'ergaenzend'] = this_meta.query("verhaeltnis_wissen == 'ergänzend'").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'abweichend_natuerlich'] = this_meta.query("verhaeltnis_wissen.str.contains('abweichend_natürlich')").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'abweichend_uebernatuerlich'] = this_meta.query("verhaeltnis_wissen.str.contains('abweichend_übernatürlich')").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'marker_person'] = this_meta.query("marker_person != '/'").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'marker_zeit'] = this_meta.query("marker_zeit != '/'").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'marker_ort'] = this_meta.query("marker_ort != '/'").shape[0]/this_meta.shape[0]
sub_df.loc[this_name, 'marker_objekt'] = this_meta.query("marker_objekt != '/'").shape[0]/this_meta.shape[0]
marker_all_counts = (this_meta[['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt']] != '/').sum(axis=1)
marker_title_counts = [x.count('Titel') for x in this_meta[['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt']].sum(axis=1)]
marker_text_counts = [x.count('Text') for x in this_meta[['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt']].sum(axis=1)]
sub_df.loc[this_name, 'marker_all_per_text'] = marker_all_counts.sum()/this_meta.shape[0]
sub_df.loc[this_name, 'marker_text_per_text'] = np.sum(marker_text_counts)/this_meta.shape[0]
sub_df.loc[this_name, 'marker_title_per_text'] = np.sum(marker_title_counts)/this_meta.shape[0]
sub_df.loc[this_name, '0marker'] = len([x for x in marker_all_counts if x == 0])/this_meta.shape[0]
sub_df.loc[this_name, '1marker'] = len([x for x in marker_all_counts if x == 1])/this_meta.shape[0]
sub_df.loc[this_name, '2marker'] = len([x for x in marker_all_counts if x == 2])/this_meta.shape[0]
sub_df.loc[this_name, '3marker'] = len([x for x in marker_all_counts if x == 3])/this_meta.shape[0]
sub_df.loc[this_name, '4marker'] = len([x for x in marker_all_counts if x == 4])/this_meta.shape[0]
In [9]:
round(sub_df, 4)
Out[9]:
| Jahr | Texte | wissen_neutral | wissen_positiv | wissen_negativ | wissen_ambivalent | uebereinstimmend | ergaenzend | abweichend_natuerlich | abweichend_uebernatuerlich | ... | marker_ort | marker_objekt | marker_all_per_text | marker_text_per_text | marker_title_per_text | 0marker | 1marker | 2marker | 3marker | 4marker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Anthologien | 1875.0 | 1850.0 | 0.9119 | 0.0405 | 0.0330 | 0.0146 | 0.1427 | 0.7476 | 0.0032 | 0.1070 | ... | 0.2768 | 0.5978 | 2.1227 | 2.0151 | 0.7038 | 0.0265 | 0.2286 | 0.4114 | 0.2627 | 0.0708 |
| Kanonisierte Moderne | 1903.0 | 113.0 | 0.8938 | 0.0177 | 0.0531 | 0.0354 | 0.1593 | 0.7257 | 0.0000 | 0.1150 | ... | 0.3097 | 0.4336 | 1.5929 | 1.1062 | 0.7080 | 0.0619 | 0.3982 | 0.4425 | 0.0796 | 0.0177 |
| Münchhausen-Kreis | 1905.0 | 140.0 | 0.8571 | 0.0286 | 0.0714 | 0.0429 | 0.1214 | 0.7714 | 0.0000 | 0.1071 | ... | 0.1357 | 0.6429 | 1.9071 | 1.8643 | 0.5643 | 0.0429 | 0.2929 | 0.4214 | 0.2000 | 0.0429 |
3 rows × 22 columns
Zeitverlauf¶
In [10]:
ts = pd.DataFrame()
ts.index = pd.Series(range(1850, 1919), name = 'year')
In [11]:
ts['text_count'] = meta_anth.groupby('year').size()
ts['text_count'] = ts['text_count'].fillna(0)
ts['text_sum'] = smooth(ts['text_count'], mode = 'sum')
In [12]:
ts['wissen_neutral_count'] = [meta_anth.query("year == @x and wissen.isna()").shape[0] for x in ts.index]
ts['wissen_neutral_sum'] = smooth(ts['wissen_neutral_count'], mode = 'sum')
ts['wissen_neutral_share_smoothed'] = ts['wissen_neutral_sum']/ts['text_sum']
ts['wissen_positiv_count'] = [meta_anth.query("year == @x and wissen == 1").shape[0] for x in ts.index]
ts['wissen_positiv_sum'] = smooth(ts['wissen_positiv_count'], mode = 'sum')
ts['wissen_positiv_share_smoothed'] = ts['wissen_positiv_sum']/ts['text_sum']
ts['wissen_negativ_count'] = [meta_anth.query("year == @x and wissen == -1").shape[0] for x in ts.index]
ts['wissen_negativ_sum'] = smooth(ts['wissen_negativ_count'], mode = 'sum')
ts['wissen_negativ_share_smoothed'] = ts['wissen_negativ_sum']/ts['text_sum']
ts['wissen_ambivalent_count'] = [meta_anth.query("year == @x and wissen == 0").shape[0] for x in ts.index]
ts['wissen_ambivalent_sum'] = smooth(ts['wissen_ambivalent_count'], mode = 'sum')
ts['wissen_ambivalent_share_smoothed'] = ts['wissen_ambivalent_sum']/ts['text_sum']
ts['uebereinstimmend_count'] = [meta_anth.query("year == @x and verhaeltnis_wissen == 'übereinstimmend'").shape[0] for x in ts.index]
ts['uebereinstimmend_sum'] = smooth(ts['uebereinstimmend_count'], mode = 'sum')
ts['uebereinstimmend_share_smoothed'] = ts['uebereinstimmend_sum']/ts['text_sum']
ts['ergaenzend_count'] = [meta_anth.query("year == @x and verhaeltnis_wissen == 'ergänzend'").shape[0] for x in ts.index]
ts['ergaenzend_sum'] = smooth(ts['ergaenzend_count'], mode = 'sum')
ts['ergaenzend_share_smoothed'] = ts['ergaenzend_sum']/ts['text_sum']
ts['abweichend_natuerlich_count'] = [meta_anth.query("year == @x and verhaeltnis_wissen.str.contains('abweichend_natürlich')").shape[0] for x in ts.index]
ts['abweichend_natuerlich_sum'] = smooth(ts['abweichend_natuerlich_count'], mode = 'sum')
ts['abweichend_natuerlich_share_smoothed'] = ts['abweichend_natuerlich_sum']/ts['text_sum']
ts['abweichend_uebernatuerlich_count'] = [meta_anth.query("year == @x and verhaeltnis_wissen.str.contains('abweichend_übernatürlich')").shape[0] for x in ts.index]
ts['abweichend_uebernatuerlich_sum'] = smooth(ts['abweichend_uebernatuerlich_count'], mode = 'sum')
ts['abweichend_uebernatuerlich_share_smoothed'] = ts['abweichend_uebernatuerlich_sum']/ts['text_sum']
ts['marker_person_count'] = [meta_anth.query("year == @x and marker_person != '/'").shape[0] for x in ts.index]
ts['marker_person_sum'] = smooth(ts['marker_person_count'], mode = 'sum')
ts['marker_person_share_smoothed'] = ts['marker_person_sum']/ts['text_sum']
ts['marker_zeit_count'] = [meta_anth.query("year == @x and marker_zeit != '/'").shape[0] for x in ts.index]
ts['marker_zeit_sum'] = smooth(ts['marker_zeit_count'], mode = 'sum')
ts['marker_zeit_share_smoothed'] = ts['marker_zeit_sum']/ts['text_sum']
ts['marker_ort_count'] = [meta_anth.query("year == @x and marker_ort != '/'").shape[0] for x in ts.index]
ts['marker_ort_sum'] = smooth(ts['marker_ort_count'], mode = 'sum')
ts['marker_ort_share_smoothed'] = ts['marker_ort_sum']/ts['text_sum']
ts['marker_objekt_count'] = [meta_anth.query("year == @x and marker_objekt != '/'").shape[0] for x in ts.index]
ts['marker_objekt_sum'] = smooth(ts['marker_objekt_count'], mode = 'sum')
ts['marker_objekt_share_smoothed'] = ts['marker_objekt_sum']/ts['text_sum']
for year in ts.index:
meta_year = meta_anth.query("year == @year")
marker_all_counts = (meta_year[['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt']] != '/').sum(axis=1)
ts.loc[year, 'marker_all_count'] = marker_all_counts.sum()
ts.loc[year, 'marker_title_count'] = np.sum([x.count('Titel') for x in meta_year[['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt']].sum(axis=1)])
ts.loc[year, 'marker_text_count'] = np.sum([x.count('Text') for x in meta_year[['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt']].sum(axis=1)])
ts.loc[year, '0marker_count'] = len([x for x in marker_all_counts if x == 0])
ts.loc[year, '1marker_count'] = len([x for x in marker_all_counts if x == 1])
ts.loc[year, '2marker_count'] = len([x for x in marker_all_counts if x == 2])
ts.loc[year, '3marker_count'] = len([x for x in marker_all_counts if x == 3])
ts.loc[year, '4marker_count'] = len([x for x in marker_all_counts if x == 4])
ts['marker_all_sum'] = smooth(ts['marker_all_count'], mode = 'sum')
ts['marker_all_per_text_smoothed'] = ts['marker_all_sum']/ts['text_sum']
ts['marker_title_sum'] = smooth(ts['marker_title_count'], mode = 'sum')
ts['marker_title_per_text_smoothed'] = ts['marker_title_sum']/ts['text_sum']
ts['marker_text_sum'] = smooth(ts['marker_text_count'], mode = 'sum')
ts['marker_text_per_text_smoothed'] = ts['marker_text_sum']/ts['text_sum']
ts['0marker_sum'] = smooth(ts['0marker_count'], mode = 'sum')
ts['0marker_share_smoothed'] = ts['0marker_sum']/ts['text_sum']
ts['1marker_sum'] = smooth(ts['1marker_count'], mode = 'sum')
ts['1marker_share_smoothed'] = ts['1marker_sum']/ts['text_sum']
ts['2marker_sum'] = smooth(ts['2marker_count'], mode = 'sum')
ts['2marker_share_smoothed'] = ts['2marker_sum']/ts['text_sum']
ts['3marker_sum'] = smooth(ts['3marker_count'], mode = 'sum')
ts['3marker_share_smoothed'] = ts['3marker_sum']/ts['text_sum']
ts['4marker_sum'] = smooth(ts['4marker_count'], mode = 'sum')
ts['4marker_share_smoothed'] = ts['4marker_sum']/ts['text_sum']
Wissen der Sprechinstanz¶
In [13]:
sub_df[[
'wissen_neutral',
'wissen_positiv',
'wissen_negativ',
'wissen_ambivalent'
]].T
Out[13]:
| Anthologien | Kanonisierte Moderne | Münchhausen-Kreis | |
|---|---|---|---|
| wissen_neutral | 0.911892 | 0.893805 | 0.857143 |
| wissen_positiv | 0.040541 | 0.017699 | 0.028571 |
| wissen_negativ | 0.032973 | 0.053097 | 0.071429 |
| wissen_ambivalent | 0.014595 | 0.035398 | 0.042857 |
In [14]:
meta_plot = ts[[
'wissen_neutral_share_smoothed',
'wissen_positiv_share_smoothed',
'wissen_negativ_share_smoothed',
'wissen_ambivalent_share_smoothed',
]]
meta_plot.columns = [
'neutral',
'wissend',
'unwissend',
'ambivalent',
]
fig = create_ts_plot(
data = meta_plot,
columns = meta_plot.columns,
y_axis_title = 'Anteil an Texten',
add_corporas = sub_df, add_corpora_names = sub_names,
add_corpora_categories = ['wissen_neutral', 'wissen_positiv', 'wissen_negativ', 'wissen_ambivalent']
)
fig.show()
wissen_behandelt¶
In [15]:
main_feature = 'wissen_behandelt'
In [16]:
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = False).head(15)
Out[16]:
wissen_behandelt 1.000000 wissend 0.661295 unwissend 0.594051 nation_volk_d_negativ 0.218218 sprechinstanz_markiert 0.198099 sprechinstanz_nicht_in_vergangenheit 0.169545 sprechakt_fragen_vorhanden 0.138801 geschichtsauffassung_negativ 0.113254 gegenwartsdominant 0.111109 sprechakte_count 0.091551 sprechakt_beschreiben_vorhanden 0.091547 geschichtsauffassung 0.089392 gegenwartsbezug 0.087330 zustand 0.082561 sprechakt_behaupten_vorhanden 0.076391 Name: wissen_behandelt, dtype: float64
In [17]:
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = True).head(15)
Out[17]:
geschichtsauffassung_positiv -0.365676 nation_volk_d_positiv -0.259097 sprechakt_erzaehlen_vorhanden -0.144987 ereignis -0.115277 liebe_positiv -0.113312 ballade -0.112486 konkretheit -0.093721 in_hohem_mass_konkret -0.090452 unbekanntes_individuum_negativ -0.089703 tod_negativ -0.084034 mittelalter -0.068044 entity_count -0.062010 bekanntes_individuum_positiv -0.058035 unbekanntes_individuum_count -0.055522 religion_negativ -0.055331 Name: wissen_behandelt, dtype: float64
In [18]:
threshold = 0.15
bin_comp_features = get_features(meta_anth_bin.corr(numeric_only=True)[main_feature], threshold = threshold, mode = 'bin')
cont_comp_features = get_features(meta_anth_bin.corr(numeric_only=True)[main_feature], threshold = threshold, mode = 'cont')
In [19]:
results = relations_binbin(
meta = meta_anth_bin,
main_feature = main_feature,
comp_features = bin_comp_features
)
In [20]:
directly_related = [
'wissend', 'unwissend'
]
results_filtered = (
results
.query("index not in @directly_related")
.query("chi2_p < 0.05 and min_expected >= 5 and phi >= @threshold")
.sort_values(by = 'diff', ascending = False)
)
round(results_filtered, 2)
Out[20]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sprechinstanz_markiert | 0.41 | 688/1687 | 0.75 | 123/163 | 0.28 | 0.28 | 0.35 | 0.42 | 0.42 | 72.60 | 0.0 | 0.0 | 0.20 | 40.0 | 71.46 |
| sprechinstanz_nicht_in_vergangenheit | 0.25 | 428/1687 | 0.52 | 85/163 | 0.19 | 0.19 | 0.27 | 0.35 | 0.35 | 53.18 | 0.0 | 0.0 | 0.17 | 78.0 | 45.20 |
| geschichtsauffassung_positiv | 0.43 | 26/61 | 0.00 | 0/16 | -0.54 | -0.55 | -0.43 | -0.30 | -0.31 | 10.30 | 0.0 | 0.0 | 0.37 | 0.0 | 5.40 |
In [21]:
results_other = results.query("index not in @results_filtered.index")
round(results_other.sort_values(by='diff', ascending=False), 2)
Out[21]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| wissend | 0.00 | 0/1687 | 0.46 | 75/163 | 0.39 | 0.38 | 0.46 | 0.54 | 0.53 | 809.03 | 0.00 | 0.00 | 0.66 | 0.0 | 6.61 |
| unwissend | 0.00 | 0/1687 | 0.37 | 61/163 | 0.29 | 0.30 | 0.37 | 0.45 | 0.45 | 652.86 | 0.00 | 0.00 | 0.59 | 0.0 | 5.37 |
| nation_volk_d_negativ | 0.08 | 4/49 | 0.29 | 2/7 | -0.10 | -0.14 | 0.20 | 0.55 | 0.61 | 2.67 | 0.10 | 0.16 | 0.22 | 2.0 | 0.75 |
| entity_ambivalent | 0.06 | 216/3599 | 0.09 | 28/317 | -0.00 | -0.00 | 0.03 | 0.06 | 0.06 | 4.00 | 0.05 | 0.05 | 0.03 | 28.0 | 19.75 |
| kollektiv_positiv | 0.37 | 361/974 | 0.39 | 35/90 | -0.08 | -0.09 | 0.02 | 0.12 | 0.12 | 0.12 | 0.73 | 0.73 | 0.01 | 35.0 | 33.50 |
| stoffgebiet_neutral | 0.19 | 436/2312 | 0.20 | 46/228 | -0.04 | -0.04 | 0.01 | 0.07 | 0.07 | 0.23 | 0.63 | 0.66 | 0.01 | 46.0 | 43.27 |
| stoffgebiet_ambivalent | 0.13 | 306/2312 | 0.14 | 32/228 | -0.04 | -0.04 | 0.01 | 0.06 | 0.06 | 0.12 | 0.73 | 0.76 | 0.01 | 32.0 | 30.34 |
| entity_neutral | 0.30 | 1062/3599 | 0.30 | 95/317 | -0.05 | -0.05 | 0.00 | 0.06 | 0.06 | 0.03 | 0.86 | 0.85 | 0.00 | 95.0 | 93.66 |
| bekanntes_individuum_negativ | 0.14 | 252/1850 | 0.14 | 24/175 | -0.05 | -0.05 | 0.00 | 0.05 | 0.05 | 0.00 | 0.97 | 1.00 | 0.00 | 24.0 | 23.85 |
| entity_negativ | 0.16 | 593/3599 | 0.16 | 51/317 | -0.04 | -0.05 | -0.00 | 0.04 | 0.04 | 0.03 | 0.86 | 0.94 | 0.00 | 51.0 | 52.13 |
| stoffgebiet_positiv | 0.47 | 1079/2312 | 0.46 | 105/228 | -0.07 | -0.07 | -0.01 | 0.06 | 0.06 | 0.03 | 0.86 | 0.89 | 0.00 | 105.0 | 106.28 |
| kollektiv_negativ | 0.25 | 246/974 | 0.24 | 22/90 | -0.09 | -0.10 | -0.01 | 0.08 | 0.09 | 0.03 | 0.87 | 1.00 | 0.01 | 22.0 | 22.67 |
| stoffgebiet_negativ | 0.21 | 491/2312 | 0.20 | 45/228 | -0.07 | -0.07 | -0.02 | 0.04 | 0.05 | 0.28 | 0.60 | 0.67 | 0.01 | 45.0 | 48.11 |
| entity_positiv | 0.48 | 1728/3599 | 0.45 | 143/317 | -0.08 | -0.09 | -0.03 | 0.03 | 0.03 | 0.98 | 0.32 | 0.35 | 0.02 | 143.0 | 151.46 |
| unbekanntes_individuum_positiv | 0.35 | 206/596 | 0.31 | 11/35 | -0.19 | -0.19 | -0.03 | 0.13 | 0.13 | 0.14 | 0.70 | 0.86 | 0.02 | 11.0 | 12.04 |
| bekanntes_individuum_positiv | 0.58 | 1065/1850 | 0.50 | 87/175 | -0.15 | -0.16 | -0.08 | -0.00 | -0.00 | 4.02 | 0.04 | 0.05 | 0.04 | 87.0 | 75.44 |
| unbekanntes_individuum_negativ | 0.13 | 80/596 | 0.03 | 1/35 | -0.15 | -0.17 | -0.11 | -0.04 | -0.03 | 3.30 | 0.07 | 0.07 | 0.07 | 1.0 | 4.49 |
| nation_volk_d_positiv | 0.78 | 38/49 | 0.43 | 3/7 | -0.71 | -0.73 | -0.35 | 0.04 | 0.04 | 3.76 | 0.05 | 0.07 | 0.26 | 3.0 | 1.88 |
In [22]:
result_categories = ['wenn_nicht', 'wenn_nicht_detail', 'wenn_ja', 'wenn_ja_detail', 'diff', 'chi2_p', 'phi',]
results_a = relations_binbin(
meta = meta_anth_bin.query("1850 <= year <= 1884"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_b = relations_binbin(
meta = meta_anth_bin.query("1885 <= year <= 1918"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_merged = results_a[result_categories].join(
results_b[result_categories],
lsuffix='_1850', rsuffix = '_1885'
)
results_merged['diff_of_diffs'] = results_merged['diff_1885'] - results_merged['diff_1850']
results_merged['diff_of_phis'] = results_merged['phi_1885'] - results_merged['phi_1850']
round(results_merged.sort_values(by = 'diff_of_phis'), 3)
Out[22]:
| wenn_nicht_1850 | wenn_nicht_detail_1850 | wenn_ja_1850 | wenn_ja_detail_1850 | diff_1850 | chi2_p_1850 | phi_1850 | wenn_nicht_1885 | wenn_nicht_detail_1885 | wenn_ja_1885 | wenn_ja_detail_1885 | diff_1885 | chi2_p_1885 | phi_1885 | diff_of_diffs | diff_of_phis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| geschichtsauffassung_positiv | 0.395 | 17/43 | 0.000 | 0/14 | -0.395 | 0.005 | 0.372 | 0.500 | 9/18 | 0.000 | 0/2 | -0.500 | 0.178 | 0.302 | -0.105 | -0.070 |
| sprechinstanz_nicht_in_vergangenheit | 0.261 | 319/1222 | 0.540 | 67/124 | 0.279 | 0.000 | 0.179 | 0.234 | 109/465 | 0.462 | 18/39 | 0.227 | 0.002 | 0.140 | -0.052 | -0.039 |
| sprechinstanz_markiert | 0.408 | 499/1222 | 0.734 | 91/124 | 0.326 | 0.000 | 0.190 | 0.406 | 189/465 | 0.821 | 32/39 | 0.414 | 0.000 | 0.223 | 0.089 | 0.033 |
In [23]:
results = relations_bincont(
meta = meta_anth_bin,
main_feature = main_feature,
comp_features = cont_comp_features
)
In [24]:
results
Out[24]:
wissend¶
In [25]:
main_feature = 'wissend'
In [26]:
meta_rel = meta_anth_bin.query("wissend == 1 or unwissend == 1").copy()
In [27]:
meta_rel.corr(numeric_only=True)[main_feature].sort_values(ascending = False).head(20)
Out[27]:
wissend 1.000000 wissen 1.000000 nation_volk_d_positiv 0.707107 liebe_positiv 0.632456 ueberlieferung_positiv 0.523785 religion_positiv 0.514286 kollektiv_positiv 0.344265 sprechakt_behaupten_vorhanden 0.334729 geschichtsauffassung_negativ 0.282889 entity_positiv 0.243058 nogenre 0.238077 stoffgebiet_positiv 0.222711 reim 0.209635 wissen_identisch 0.194534 sprechakt_auffordern_vorhanden 0.192846 gegenwartsbezug 0.183776 krieg_positiv 0.181463 mittelraum_count 0.171691 ende 0.171373 sprechakte_count 0.171168 Name: wissend, dtype: float64
In [28]:
meta_rel.corr(numeric_only=True)[main_feature].sort_values(ascending = True).head(20)
Out[28]:
unwissend -1.000000 tod_negativ -0.316228 krieg_negativ -0.302614 religion_negativ -0.298807 konkretheit -0.261153 entity_neutral -0.255536 in_hohem_mass_konkret -0.238077 sprechinstanz_in_vergangenheit -0.214824 ueberlieferung_negativ -0.213201 politik_negativ -0.200446 liebe_negativ -0.200000 objektmarker_vorhanden -0.183776 denkmal -0.176233 liebe -0.166221 rollengedicht -0.163107 stoffgebiet_negativ -0.161601 wissen_ergaenzend -0.160328 sprechakt_beschreiben_vorhanden -0.144416 tod -0.136444 anachronismus -0.116760 Name: wissend, dtype: float64
In [29]:
threshold = 0.2
bin_comp_features = get_features(meta_rel.corr(numeric_only=True)[main_feature], threshold = threshold, mode = 'bin')
bin_comp_features = bin_comp_features + ['rollengedicht']
cont_comp_features = get_features(meta_rel.corr(numeric_only=True)[main_feature], threshold = threshold, mode = 'cont')
cont_comp_features = cont_comp_features + ['stoffgebiet_neutral']
In [30]:
results = relations_binbin(
meta = meta_rel,
main_feature = main_feature,
comp_features = bin_comp_features
)
In [31]:
directly_related = ['krieg_negativ']
results_filtered = (
results
.query("index not in @directly_related")
.query("chi2_p < 0.05 and min_expected >= 5 and phi >= @threshold")
.sort_values(by = 'diff', ascending = False)
)
round(results_filtered, 2)
Out[31]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| kollektiv_positiv | 0.16 | 4/25 | 0.49 | 24/49 | 0.11 | 0.13 | 0.33 | 0.53 | 0.51 | 7.65 | 0.01 | 0.01 | 0.32 | 4.0 | 9.46 |
| sprechakt_behaupten_vorhanden | 0.13 | 8/61 | 0.44 | 33/75 | 0.17 | 0.17 | 0.31 | 0.45 | 0.45 | 15.24 | 0.00 | 0.00 | 0.33 | 8.0 | 18.39 |
| stoffgebiet_positiv | 0.33 | 28/84 | 0.55 | 56/101 | 0.07 | 0.08 | 0.22 | 0.36 | 0.36 | 9.05 | 0.00 | 0.00 | 0.22 | 28.0 | 38.14 |
| nogenre | 0.18 | 11/61 | 0.40 | 30/75 | 0.08 | 0.07 | 0.22 | 0.37 | 0.35 | 7.71 | 0.01 | 0.01 | 0.24 | 11.0 | 18.39 |
| reim | 0.84 | 51/61 | 0.96 | 72/75 | 0.01 | 0.02 | 0.12 | 0.23 | 0.24 | 5.98 | 0.01 | 0.02 | 0.21 | 3.0 | 5.83 |
| sprechinstanz_in_vergangenheit | 0.33 | 20/61 | 0.15 | 11/75 | -0.31 | -0.32 | -0.18 | -0.04 | -0.03 | 6.28 | 0.01 | 0.01 | 0.21 | 11.0 | 13.90 |
| entity_neutral | 0.43 | 51/119 | 0.22 | 32/144 | -0.32 | -0.32 | -0.21 | -0.09 | -0.09 | 12.84 | 0.00 | 0.00 | 0.22 | 32.0 | 37.56 |
| in_hohem_mass_konkret | 0.82 | 50/61 | 0.60 | 45/75 | -0.37 | -0.37 | -0.22 | -0.07 | -0.07 | 7.71 | 0.01 | 0.01 | 0.24 | 11.0 | 18.39 |
In [32]:
results_other = results.query("index not in @results_filtered.index")
round(results_other.sort_values(by='diff', ascending=False), 2)
Out[32]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| liebe_positiv | 0.20 | 1/5 | 1.00 | 1/1 | 0.40 | 0.45 | 0.80 | 1.15 | 1.00 | 2.40 | 0.12 | 0.33 | 0.63 | 0.0 | 0.33 |
| nation_volk_d_positiv | 0.00 | 0/3 | 0.67 | 2/3 | 0.00 | 0.13 | 0.67 | 1.20 | 1.00 | 3.00 | 0.08 | 0.40 | 0.71 | 0.0 | 1.00 |
| ueberlieferung_positiv | 0.25 | 3/12 | 0.79 | 19/24 | 0.21 | 0.25 | 0.54 | 0.84 | 0.79 | 9.88 | 0.00 | 0.00 | 0.52 | 3.0 | 4.67 |
| religion_positiv | 0.29 | 2/7 | 0.80 | 8/10 | 0.07 | 0.10 | 0.51 | 0.93 | 0.90 | 4.50 | 0.03 | 0.06 | 0.51 | 2.0 | 2.88 |
| geschichtsauffassung_negativ | 0.17 | 1/6 | 0.43 | 3/7 | -0.21 | -0.21 | 0.26 | 0.73 | 0.71 | 1.04 | 0.31 | 0.56 | 0.28 | 1.0 | 1.85 |
| entity_positiv | 0.34 | 40/119 | 0.53 | 77/144 | 0.08 | 0.08 | 0.20 | 0.32 | 0.31 | 10.40 | 0.00 | 0.00 | 0.20 | 40.0 | 52.94 |
| bekanntes_individuum_positiv | 0.41 | 28/69 | 0.58 | 43/74 | 0.02 | 0.01 | 0.18 | 0.34 | 0.35 | 4.39 | 0.04 | 0.04 | 0.18 | 28.0 | 34.26 |
| unbekanntes_individuum_positiv | 0.26 | 5/19 | 0.38 | 5/13 | -0.19 | -0.21 | 0.12 | 0.45 | 0.46 | 0.53 | 0.47 | 0.70 | 0.13 | 5.0 | 4.06 |
| kollektiv_negativ | 0.20 | 5/25 | 0.27 | 13/49 | -0.14 | -0.13 | 0.07 | 0.26 | 0.27 | 0.38 | 0.54 | 0.58 | 0.07 | 5.0 | 6.08 |
| bekanntes_individuum_negativ | 0.10 | 7/69 | 0.16 | 12/74 | -0.05 | -0.05 | 0.06 | 0.17 | 0.17 | 1.14 | 0.29 | 0.33 | 0.09 | 7.0 | 9.17 |
| entity_negativ | 0.13 | 15/119 | 0.18 | 26/144 | -0.03 | -0.03 | 0.05 | 0.14 | 0.14 | 1.47 | 0.23 | 0.24 | 0.07 | 15.0 | 18.55 |
| stoffgebiet_ambivalent | 0.15 | 13/84 | 0.13 | 13/101 | -0.13 | -0.13 | -0.03 | 0.08 | 0.08 | 0.26 | 0.61 | 0.67 | 0.04 | 13.0 | 11.81 |
| entity_ambivalent | 0.11 | 13/119 | 0.06 | 9/144 | -0.12 | -0.12 | -0.05 | 0.02 | 0.02 | 1.86 | 0.17 | 0.19 | 0.08 | 9.0 | 9.95 |
| stoffgebiet_neutral | 0.26 | 22/84 | 0.18 | 18/101 | -0.20 | -0.20 | -0.08 | 0.04 | 0.03 | 1.90 | 0.17 | 0.21 | 0.10 | 18.0 | 18.16 |
| stoffgebiet_negativ | 0.25 | 21/84 | 0.14 | 14/101 | -0.22 | -0.23 | -0.11 | 0.00 | -0.01 | 3.71 | 0.05 | 0.06 | 0.14 | 14.0 | 15.89 |
| ueberlieferung_negativ | 0.17 | 2/12 | 0.04 | 1/24 | -0.38 | -0.35 | -0.12 | 0.10 | 0.08 | 1.64 | 0.20 | 0.25 | 0.21 | 1.0 | 1.00 |
| rollengedicht | 0.26 | 16/61 | 0.13 | 10/75 | -0.27 | -0.26 | -0.13 | 0.01 | 0.01 | 3.62 | 0.06 | 0.08 | 0.16 | 10.0 | 11.66 |
| religion_negativ | 0.14 | 1/7 | 0.00 | 0/10 | -0.43 | -0.40 | -0.14 | 0.12 | 0.00 | 1.52 | 0.22 | 0.41 | 0.30 | 0.0 | 0.41 |
| politik_negativ | 0.29 | 2/7 | 0.12 | 1/8 | -0.59 | -0.57 | -0.16 | 0.24 | 0.25 | 0.60 | 0.44 | 0.57 | 0.20 | 1.0 | 1.40 |
| krieg_negativ | 0.32 | 9/28 | 0.08 | 3/36 | -0.44 | -0.43 | -0.24 | -0.04 | -0.03 | 5.86 | 0.02 | 0.02 | 0.30 | 3.0 | 5.25 |
| tod_negativ | 0.25 | 2/8 | 0.00 | 0/4 | -0.50 | -0.55 | -0.25 | 0.05 | 0.00 | 1.20 | 0.27 | 0.52 | 0.32 | 0.0 | 0.67 |
In [33]:
result_categories = ['wenn_nicht', 'wenn_nicht_detail', 'wenn_ja', 'wenn_ja_detail', 'diff', 'chi2_p', 'phi',]
results_a = relations_binbin(
meta = meta_rel.query("1850 <= year <= 1884"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_b = relations_binbin(
meta = meta_rel.query("1885 <= year <= 1918"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_merged = results_a[result_categories].join(
results_b[result_categories],
lsuffix='_1850', rsuffix = '_1885'
)
results_merged['diff_of_diffs'] = results_merged['diff_1885'] - results_merged['diff_1850']
results_merged['diff_of_phis'] = results_merged['phi_1885'] - results_merged['phi_1850']
round(results_merged.sort_values(by = 'diff_of_phis'), 3)
Out[33]:
| wenn_nicht_1850 | wenn_nicht_detail_1850 | wenn_ja_1850 | wenn_ja_detail_1850 | diff_1850 | chi2_p_1850 | phi_1850 | wenn_nicht_1885 | wenn_nicht_detail_1885 | wenn_ja_1885 | wenn_ja_detail_1885 | diff_1885 | chi2_p_1885 | phi_1885 | diff_of_diffs | diff_of_phis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| in_hohem_mass_konkret | 0.864 | 38/44 | 0.603 | 35/58 | -0.260 | 0.004 | 0.286 | 0.706 | 12/17 | 0.588 | 10/17 | -0.118 | 0.473 | 0.123 | 0.143 | -0.163 |
| sprechakt_behaupten_vorhanden | 0.091 | 4/44 | 0.431 | 25/58 | 0.340 | 0.000 | 0.373 | 0.235 | 4/17 | 0.471 | 8/17 | 0.235 | 0.151 | 0.246 | -0.105 | -0.127 |
| nogenre | 0.205 | 9/44 | 0.414 | 24/58 | 0.209 | 0.025 | 0.222 | 0.118 | 2/17 | 0.353 | 6/17 | 0.235 | 0.106 | 0.277 | 0.026 | 0.056 |
| reim | 0.864 | 38/44 | 0.966 | 56/58 | 0.102 | 0.058 | 0.188 | 0.765 | 13/17 | 0.941 | 16/17 | 0.176 | 0.146 | 0.249 | 0.075 | 0.061 |
| stoffgebiet_positiv | 0.381 | 24/63 | 0.587 | 44/75 | 0.206 | 0.016 | 0.205 | 0.190 | 4/21 | 0.462 | 12/26 | 0.271 | 0.051 | 0.284 | 0.065 | 0.079 |
| entity_neutral | 0.396 | 36/91 | 0.212 | 24/113 | -0.183 | 0.004 | 0.200 | 0.536 | 15/28 | 0.258 | 8/31 | -0.278 | 0.029 | 0.284 | -0.094 | 0.084 |
| sprechinstanz_in_vergangenheit | 0.273 | 12/44 | 0.138 | 8/58 | -0.135 | 0.089 | 0.168 | 0.471 | 8/17 | 0.176 | 3/17 | -0.294 | 0.067 | 0.314 | -0.159 | 0.146 |
| kollektiv_positiv | 0.158 | 3/19 | 0.425 | 17/40 | 0.267 | 0.043 | 0.264 | 0.167 | 1/6 | 0.778 | 7/9 | 0.611 | 0.020 | 0.600 | 0.344 | 0.336 |
In [34]:
results = relations_binbin(
meta = meta_anth_bin,
main_feature = 'sprechakt_behaupten_vorhanden',
comp_features = ['in_hohem_mass_konkret', 'nogenre']
)
round(results, 2)
Out[34]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| in_hohem_mass_konkret | 0.91 | 1369/1505 | 0.42 | 144/345 | -0.55 | -0.55 | -0.49 | -0.44 | -0.44 | 456.48 | 0.0 | 0.0 | 0.50 | 136.0 | 62.85 |
| nogenre | 0.18 | 266/1505 | 0.45 | 156/345 | 0.22 | 0.22 | 0.28 | 0.33 | 0.33 | 120.92 | 0.0 | 0.0 | 0.26 | 156.0 | 78.70 |
In [35]:
results = relations_bincont(
meta = meta_rel,
main_feature = main_feature,
comp_features = [x for x in cont_comp_features if x != 'wissen']
)
In [36]:
round(results.sort_values(by = 'pointbiserialr_corr', ascending = False), 4) # .query("mannwhitneyu_p < 0.05")
Out[36]:
| wenn_nicht | a_merkmal=0 | a_merkmal=1 | a_merkmal=2 | a_merkmal=3 | a_merkmal>=4 | wenn_ja | b_merkmal=0 | b_merkmal=1 | b_merkmal=2 | ... | pointbiserialr_corr | pointbiserialr_p | ttest_p | cohens_d | mannwhitneyu_stat | mannwhitneyu_p | meandiffs_ci_lower | meandiffs_ci_bootstrap_lower | meandiffs_ci_upper | meandiffs_ci_bootstrap_upper | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| stoffgebiet_neutral | 0.3607 | 0.69 [42/61] | 0.26 [16/61] | 0.05 [3/61] | 0.0 [0/61] | 0.0 [0/61] | 0.2400 | 0.81 [61/75] | 0.13 [10/75] | 0.05 [4/75] | ... | -0.1077 | 0.2121 | 0.2121 | 0.2171 | 2556.0 | 0.1166 | -0.3110 | -0.3052 | 0.0697 | 0.0702 |
| konkretheit | 0.9098 | 0.0 [0/61] | 0.82 [50/61] | 0.0 [0/61] | 0.0 [0/61] | 0.0 [0/61] | 0.7667 | 0.07 [5/75] | 0.6 [45/75] | 0.0 [0/75] | ... | -0.2612 | 0.0021 | 0.0021 | 0.5562 | 2817.5 | 0.0038 | -0.2336 | -0.2314 | -0.0528 | -0.0534 |
2 rows × 22 columns
In [37]:
meta_plot = meta_rel.copy()
for cont_comp_feature in cont_comp_features:
mean_main = meta_plot[meta_plot[main_feature] == 1][cont_comp_feature].mean()
mean_notmain = meta_plot[meta_plot[main_feature] == 0][cont_comp_feature].mean()
label_main = f"Wissend<br>(Mittelwert = {round(mean_main, 2)})"
label_notmain = f"Unwissend<br>(Mittelwert = {round(mean_notmain, 2)})"
meta_plot['plot_legend'] = [label_main if x == 1 else label_notmain for x in meta_plot[main_feature]]
fig = px.histogram(
meta_plot,
x = cont_comp_feature,
color = 'plot_legend',
histnorm = 'probability density',
barmode = 'group',
labels = {'plot_legend' : '',
'entity_neutral' : 'Anzahl neutral bewertete Entitäten',
'stoffgebiet_positiv' : 'Anzahl positiv bewertete Stoffgebiete',
'entity_positiv' : 'Anzahl positiv bewertete Entitäten',
}
)
fig.update_layout(
width = 700, height = 300,
yaxis_title="Anteil",
xaxis=dict(tickfont=dict(size=16), titlefont=dict(size=16)),
yaxis=dict(tickfont=dict(size=16), titlefont=dict(size=16)),
legend=dict(font = dict(size=16), x=0.6, y = 0.92),
bargap=0.1
)
# fig.write_image(f"plots/6.9 Wissen – {cont_comp_feature}.pdf")
fig.show()
In [38]:
result_categories = ['wenn_nicht', 'wenn_ja', 'mannwhitneyu_p', 'pointbiserialr_corr',]
results_a = relations_bincont(
meta = meta_anth_bin.query("1850 <= year <= 1884"),
main_feature = main_feature,
comp_features = [x for x in cont_comp_features if x != 'wissen']
)
results_b = relations_bincont(
meta = meta_anth_bin.query("1885 <= year <= 1918"),
main_feature = main_feature,
comp_features = [x for x in cont_comp_features if x != 'wissen']
)
results_merged = results_a[result_categories].join(
results_b[result_categories],
lsuffix='_1850', rsuffix = '_1885'
)
results_merged['diff_of_corrs'] = results_merged['pointbiserialr_corr_1885'] - results_merged['pointbiserialr_corr_1850']
round(results_merged.sort_values(by = 'diff_of_corrs'), 3)
Out[38]:
| wenn_nicht_1850 | wenn_ja_1850 | mannwhitneyu_p_1850 | pointbiserialr_corr_1850 | wenn_nicht_1885 | wenn_ja_1885 | mannwhitneyu_p_1885 | pointbiserialr_corr_1885 | diff_of_corrs | |
|---|---|---|---|---|---|---|---|---|---|
| konkretheit | 0.911 | 0.776 | 0.000 | -0.128 | 0.900 | 0.735 | 0.015 | -0.137 | -0.009 |
| stoffgebiet_neutral | 0.241 | 0.190 | 0.405 | -0.020 | 0.316 | 0.412 | 0.670 | 0.030 | 0.050 |
Verhältnis zum historischen Wissen¶
In [39]:
meta_plot = ts[[
'uebereinstimmend_share_smoothed',
'ergaenzend_share_smoothed',
'abweichend_natuerlich_share_smoothed',
'abweichend_uebernatuerlich_share_smoothed',
]]
meta_plot.columns = [
'übereinstimmend',
'ergänzend',
'abweichend_natürlich',
'abweichend_übernatürlich',
]
# save_ts_data(meta_plot, prefix='06_09_Historisches_Wissen_')
fig = create_ts_plot(
data = meta_plot,
columns = meta_plot.columns,
y_axis_title = 'Anteil an Texten',
add_corporas = sub_df, add_corpora_names = sub_names,
add_corpora_categories = ['uebereinstimmend', 'ergaenzend', 'abweichend_natuerlich', 'abweichend_uebernatuerlich']
)
fig = update_fig_for_publication(fig)
fig.write_image(f"plots/6.9 Verhältnis zum historischen Wissen im Zeitverlauf.pdf")
fig.show()
In [40]:
meta_anth_bin.query("1896<=year<=1898")[['author', 'title', 'wissen_identisch']].sort_values(by='wissen_identisch')
Out[40]:
| author | title | wissen_identisch | |
|---|---|---|---|
| 1078 | Gries, Wilhelm | Zur Feier des hundertjährigen Geburtstages Kai... | 0 |
| 1162 | Scholz, Wilhelm von | Der Strauchritter | 0 |
| 1184 | Wickenburg, Albrecht von | Die Gelbschnäbel von Kolin | 0 |
| 1120 | Avenarius, Ferdinand | Rolands Horn | 0 |
| 1484 | Schafheitlin, Adolf | Des Oheims Erzählung | 0 |
| 1460 | Münchhausen, Börries von | Der Abschied zu Fontainebleau | 0 |
| 1448 | Fischer, Julius | Des Königs Traum | 0 |
| 1414 | Avenarius, Ferdinand | Die Pest | 0 |
| 1392 | Ruseler, Georg | Wittekind vor der Taufe | 0 |
| 1384 | Avenarius, Ferdinand | Tejas Heimfahrt | 0 |
| 1375 | Renner, Gustav | Cäsar | 0 |
| 1374 | Treller, Franz | Die Hörige | 0 |
| 1371 | Avenarius, Ferdinand | Der Breitenstein | 0 |
| 1369 | Renner, Gustav | Siegrund und Helge | 0 |
| 1290 | Seydel, Max von | Sphakteria | 0 |
| 1187 | Wickenburg, Albrecht von | Das letzte Aufgebot | 0 |
| 1497 | Wickenburg, Albrecht von | Des Sandwirts Heimkehr | 0 |
| 1633 | Rüthning, Paul | Der Überfall | 0 |
| 1755 | Schönaich-Carolath, Emil von | Lied des Gefangenen | 0 |
| 1666 | Münchhausen, Börries von | Halfdan, Ragnars Sohn | 0 |
| 1658 | Heyse, Paul | Die Mutter des Siegers | 0 |
| 1588 | Curti, Theod. | Im Tale Schwyz | 0 |
| 1606 | Frey, Adolf | Die Kappelkämpfer | 0 |
| 1158 | Münchhausen, Börries von | Wir. Zu Helm und Schild geboren | 1 |
| 1073 | Delpy, Gustav | Dem Andenken Kaiser Wilhelms I. | 1 |
| 1074 | Greif, Martin | Dem Heldenkaiser | 1 |
| 1109 | Greif, Martin | Königin Luise | 1 |
| 1111 | Greif, Martin | Zur Bestattung Bismarcks im Sachsenwalde | 1 |
| 1146 | Wolff, Julius | Das deutsche Heer | 1 |
| 1076 | Jordan, Wilhelm | Was er vollbracht, ist wunderhaft | 1 |
| 1077 | Nießen, Joseph | Zum 22. März 1897 [Festesklänge hallen wieder] | 1 |
| 1080 | Hoffs, Friedrich van | Zur Gedächtnisfeier des Kaisers Wilhelms I. | 1 |
| 1544 | Saar, Ferdinand von | Mozart | 1 |
| 1485 | Haaß, Robert | Dem Vater des Vaterlandes | 1 |
| 1480 | Wildenbruch, Ernst von | Inschrift an Villa Zirio in San Remo | 1 |
| 1081 | Greif, Martin | Die Kornblume (Zu Kaiser Wilhelms Gedächtnis) | 1 |
| 1075 | Liliencron, A. von | Es tönen die Glocken weit hin durch das Reich | 1 |
| 1072 | Diemar, Adamine von | Zum Todestage der Kaiserin Augusta | 1 |
| 1801 | Fontane, Theodor | Auf der Kuppe der Müggelberge | 1 |
In [41]:
main_feature = 'wissen_identisch'
In [42]:
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = False).head(20)
Out[42]:
wissen_identisch 1.000000 sprechakt_behaupten_vorhanden 0.439494 gegenwartsbezug 0.403737 sprechinstanz_nicht_in_vergangenheit 0.379027 nogenre 0.349070 zustand 0.346266 gegenwartsdominant 0.316185 nationalismus 0.282898 ueberlieferung 0.238014 ende 0.236246 neuzeit 0.226876 sprechinstanz_markiert 0.225085 zeit_mitte 0.223246 liebe_positiv 0.220085 sprechakte_count 0.216678 denkmal 0.211014 beginn 0.205790 sprechakt_beschreiben_vorhanden 0.195068 sonett 0.187559 wissen 0.180619 Name: wissen_identisch, dtype: float64
In [43]:
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = True).head(20)
Out[43]:
wissen_ergaenzend -0.702107 konkretheit -0.609416 in_hohem_mass_konkret -0.596218 sprechakt_erzaehlen_vorhanden -0.445050 ballade -0.382442 ereignis -0.367976 entity_count -0.229154 words -0.204238 kleinraum_count -0.203893 mittelalter -0.176233 objektmarker_vorhanden -0.166505 unbekanntes_individuum_count -0.159578 sprechinstanz_in_vergangenheit -0.157757 bekanntes_individuum_count -0.155486 entity_neutral -0.149800 rollengedicht -0.142517 persmarker_vorhanden -0.136207 empirisch -0.127596 entity_negativ -0.124995 antike -0.117027 Name: wissen_identisch, dtype: float64
In [44]:
threshold = 0.3
bin_comp_features = get_features(meta_anth_bin.corr(numeric_only=True)[main_feature], threshold = threshold, mode = 'bin')
cont_comp_features = get_features(meta_anth_bin.corr(numeric_only=True)[main_feature], threshold = threshold, mode = 'cont')
In [45]:
results = relations_binbin(
meta = meta_anth_bin,
main_feature = main_feature,
comp_features = bin_comp_features
)
In [46]:
directly_related = [
'wissen_ergaenzend',
'gegenwartsdominant', # related to gegenwartsbezug
]
results_filtered = (
results
.query("index not in @directly_related")
.query("chi2_p < 0.05 and min_expected >= 5 and phi >= @threshold")
.sort_values(by = 'diff', ascending = False)
)
round(results_filtered, 2)
Out[46]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| gegenwartsbezug | 0.22 | 341/1586 | 0.74 | 195/264 | 0.47 | 0.47 | 0.52 | 0.58 | 0.58 | 301.56 | 0.0 | 0.0 | 0.40 | 69.0 | 76.49 |
| sprechakt_behaupten_vorhanden | 0.12 | 185/1586 | 0.61 | 160/264 | 0.43 | 0.43 | 0.49 | 0.55 | 0.55 | 357.34 | 0.0 | 0.0 | 0.44 | 104.0 | 49.23 |
| zustand | 0.35 | 548/1586 | 0.83 | 220/264 | 0.44 | 0.44 | 0.49 | 0.54 | 0.54 | 221.82 | 0.0 | 0.0 | 0.35 | 44.0 | 109.60 |
| sprechinstanz_nicht_in_vergangenheit | 0.21 | 330/1586 | 0.69 | 183/264 | 0.43 | 0.43 | 0.49 | 0.54 | 0.55 | 265.77 | 0.0 | 0.0 | 0.38 | 81.0 | 73.21 |
| nogenre | 0.17 | 267/1586 | 0.59 | 155/264 | 0.35 | 0.36 | 0.42 | 0.48 | 0.48 | 225.42 | 0.0 | 0.0 | 0.35 | 109.0 | 60.22 |
| ereignis | 0.84 | 1328/1586 | 0.40 | 105/264 | -0.50 | -0.50 | -0.44 | -0.38 | -0.38 | 250.50 | 0.0 | 0.0 | 0.37 | 105.0 | 59.51 |
| sprechakt_erzaehlen_vorhanden | 0.84 | 1339/1586 | 0.31 | 81/264 | -0.59 | -0.60 | -0.54 | -0.48 | -0.48 | 366.43 | 0.0 | 0.0 | 0.45 | 81.0 | 61.36 |
| ballade | 0.64 | 1011/1586 | 0.09 | 25/264 | -0.59 | -0.59 | -0.54 | -0.50 | -0.50 | 270.58 | 0.0 | 0.0 | 0.38 | 25.0 | 116.16 |
| in_hohem_mass_konkret | 0.91 | 1446/1586 | 0.25 | 67/264 | -0.71 | -0.71 | -0.66 | -0.60 | -0.60 | 657.63 | 0.0 | 0.0 | 0.60 | 67.0 | 48.09 |
In [47]:
results_other = results.query("index not in @results_filtered.index")
round(results_other.sort_values(by='diff', ascending=False), 2)
Out[47]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| gegenwartsdominant | 0.10 | 157/1586 | 0.42 | 110/264 | 0.26 | 0.26 | 0.32 | 0.38 | 0.38 | 184.95 | 0.00 | 0.00 | 0.32 | 110.0 | 38.10 |
| stoffgebiet_positiv | 0.43 | 941/2189 | 0.69 | 243/351 | 0.21 | 0.21 | 0.26 | 0.31 | 0.31 | 83.72 | 0.00 | 0.00 | 0.18 | 108.0 | 163.62 |
| bekanntes_individuum_positiv | 0.54 | 992/1823 | 0.79 | 160/202 | 0.19 | 0.19 | 0.25 | 0.31 | 0.32 | 45.57 | 0.00 | 0.00 | 0.15 | 42.0 | 87.08 |
| entity_positiv | 0.46 | 1587/3485 | 0.66 | 284/431 | 0.16 | 0.16 | 0.20 | 0.25 | 0.25 | 63.70 | 0.00 | 0.00 | 0.13 | 147.0 | 205.92 |
| kollektiv_positiv | 0.34 | 315/914 | 0.54 | 81/150 | 0.11 | 0.11 | 0.20 | 0.28 | 0.28 | 21.05 | 0.00 | 0.00 | 0.14 | 69.0 | 55.83 |
| entity_ambivalent | 0.06 | 226/3485 | 0.04 | 18/431 | -0.04 | -0.04 | -0.02 | -0.00 | -0.00 | 3.50 | 0.06 | 0.07 | 0.03 | 18.0 | 26.85 |
| stoffgebiet_ambivalent | 0.14 | 310/2189 | 0.08 | 28/351 | -0.09 | -0.09 | -0.06 | -0.03 | -0.03 | 10.03 | 0.00 | 0.00 | 0.06 | 28.0 | 46.71 |
| entity_negativ | 0.17 | 598/3485 | 0.11 | 46/431 | -0.10 | -0.10 | -0.06 | -0.03 | -0.03 | 11.74 | 0.00 | 0.00 | 0.05 | 46.0 | 70.88 |
| bekanntes_individuum_negativ | 0.14 | 262/1823 | 0.07 | 14/202 | -0.11 | -0.11 | -0.07 | -0.04 | -0.04 | 8.55 | 0.00 | 0.00 | 0.06 | 14.0 | 27.53 |
| kollektiv_negativ | 0.26 | 241/914 | 0.18 | 27/150 | -0.15 | -0.15 | -0.08 | -0.02 | -0.01 | 4.79 | 0.03 | 0.03 | 0.07 | 27.0 | 37.78 |
| stoffgebiet_negativ | 0.22 | 490/2189 | 0.13 | 46/351 | -0.13 | -0.13 | -0.09 | -0.05 | -0.05 | 15.64 | 0.00 | 0.00 | 0.08 | 46.0 | 74.07 |
| stoffgebiet_neutral | 0.20 | 448/2189 | 0.10 | 34/351 | -0.14 | -0.14 | -0.11 | -0.07 | -0.07 | 22.86 | 0.00 | 0.00 | 0.09 | 34.0 | 66.61 |
| entity_neutral | 0.31 | 1074/3485 | 0.19 | 83/431 | -0.15 | -0.16 | -0.12 | -0.08 | -0.08 | 24.62 | 0.00 | 0.00 | 0.08 | 83.0 | 127.34 |
| unbekanntes_individuum_negativ | 0.13 | 81/614 | 0.00 | 0/17 | -0.16 | -0.16 | -0.13 | -0.11 | -0.11 | 2.57 | 0.11 | 0.15 | 0.06 | 0.0 | 2.18 |
| unbekanntes_individuum_positiv | 0.35 | 215/614 | 0.12 | 2/17 | -0.37 | -0.39 | -0.23 | -0.07 | -0.06 | 3.96 | 0.05 | 0.07 | 0.08 | 2.0 | 5.85 |
| wissen_ergaenzend | 0.87 | 1383/1586 | 0.00 | 0/264 | -0.89 | -0.89 | -0.87 | -0.86 | -0.86 | 911.96 | 0.00 | 0.00 | 0.70 | 0.0 | 66.64 |
In [48]:
result_categories = ['wenn_nicht', 'wenn_nicht_detail', 'wenn_ja', 'wenn_ja_detail', 'diff', 'chi2_p', 'phi',]
results_a = relations_binbin(
meta = meta_anth_bin.query("1850 <= year <= 1884"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_b = relations_binbin(
meta = meta_anth_bin.query("1885 <= year <= 1918"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_merged = results_a[result_categories].join(
results_b[result_categories],
lsuffix='_1850', rsuffix = '_1885'
)
results_merged['diff_of_diffs'] = results_merged['diff_1885'] - results_merged['diff_1850']
results_merged['diff_of_phis'] = results_merged['phi_1885'] - results_merged['phi_1850']
round(results_merged.sort_values(by = 'diff_of_phis'), 3)
Out[48]:
| wenn_nicht_1850 | wenn_nicht_detail_1850 | wenn_ja_1850 | wenn_ja_detail_1850 | diff_1850 | chi2_p_1850 | phi_1850 | wenn_nicht_1885 | wenn_nicht_detail_1885 | wenn_ja_1885 | wenn_ja_detail_1885 | diff_1885 | chi2_p_1885 | phi_1885 | diff_of_diffs | diff_of_phis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sprechakt_erzaehlen_vorhanden | 0.859 | 1005/1170 | 0.312 | 55/176 | -0.546 | 0.0 | 0.450 | 0.803 | 334/416 | 0.295 | 26/88 | -0.507 | 0.0 | 0.426 | 0.039 | -0.024 |
| ereignis | 0.860 | 1006/1170 | 0.420 | 74/176 | -0.439 | 0.0 | 0.372 | 0.774 | 322/416 | 0.352 | 31/88 | -0.422 | 0.0 | 0.350 | 0.018 | -0.022 |
| ballade | 0.675 | 790/1170 | 0.125 | 22/176 | -0.550 | 0.0 | 0.379 | 0.531 | 221/416 | 0.034 | 3/88 | -0.497 | 0.0 | 0.380 | 0.053 | 0.001 |
| in_hohem_mass_konkret | 0.912 | 1067/1170 | 0.239 | 42/176 | -0.673 | 0.0 | 0.596 | 0.911 | 379/416 | 0.284 | 25/88 | -0.627 | 0.0 | 0.597 | 0.046 | 0.001 |
| zustand | 0.319 | 373/1170 | 0.807 | 142/176 | 0.488 | 0.0 | 0.339 | 0.421 | 175/416 | 0.886 | 78/88 | 0.466 | 0.0 | 0.354 | -0.022 | 0.015 |
| nogenre | 0.142 | 166/1170 | 0.540 | 95/176 | 0.398 | 0.0 | 0.339 | 0.243 | 101/416 | 0.682 | 60/88 | 0.439 | 0.0 | 0.357 | 0.041 | 0.018 |
| sprechakt_behaupten_vorhanden | 0.111 | 130/1170 | 0.591 | 104/176 | 0.480 | 0.0 | 0.427 | 0.132 | 55/416 | 0.636 | 56/88 | 0.504 | 0.0 | 0.462 | 0.024 | 0.035 |
| gegenwartsbezug | 0.225 | 263/1170 | 0.727 | 128/176 | 0.502 | 0.0 | 0.373 | 0.188 | 78/416 | 0.761 | 67/88 | 0.574 | 0.0 | 0.481 | 0.071 | 0.108 |
| sprechinstanz_nicht_in_vergangenheit | 0.226 | 264/1170 | 0.693 | 122/176 | 0.468 | 0.0 | 0.349 | 0.159 | 66/416 | 0.693 | 61/88 | 0.535 | 0.0 | 0.467 | 0.067 | 0.119 |
In [49]:
results = relations_bincont(
meta = meta_anth_bin,
main_feature = main_feature,
comp_features = cont_comp_features
)
In [50]:
round(results.sort_values(by = 'pointbiserialr_corr', ascending = False), 2)
Out[50]:
| wenn_nicht | a_merkmal=0 | a_merkmal=1 | a_merkmal=2 | a_merkmal=3 | a_merkmal>=4 | wenn_ja | b_merkmal=0 | b_merkmal=1 | b_merkmal=2 | ... | pointbiserialr_corr | pointbiserialr_p | ttest_p | cohens_d | mannwhitneyu_stat | mannwhitneyu_p | meandiffs_ci_lower | meandiffs_ci_bootstrap_lower | meandiffs_ci_upper | meandiffs_ci_bootstrap_upper | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| konkretheit | 0.96 | 0.0 [1/1586] | 0.91 [1446/1586] | 0.0 [0/1586] | 0.0 [0/1586] | 0.0 [0/1586] | 0.58 | 0.09 [24/264] | 0.25 [67/264] | 0.0 [0/264] | ... | -0.61 | 0.0 | 0.0 | 1.67 | 348674.5 | 0.0 | -0.4 | -0.41 | -0.35 | -0.34 |
1 rows × 22 columns
In [51]:
results = relations_binbin(
meta = meta_anth_bin,
main_feature = 'wissen_ergaenzend',
comp_features = ['in_hohem_mass_konkret']
)
results.T
Out[51]:
| in_hohem_mass_konkret | |
|---|---|
| wenn_nicht | 0.513919 |
| wenn_nicht_detail | 240/467 |
| wenn_ja | 0.920463 |
| wenn_ja_detail | 1273/1383 |
| diff_low_bootstrap | 0.364354 |
| diff_low | 0.359023 |
| diff | 0.406544 |
| diff_high | 0.454065 |
| diff_high_bootstrap | 0.452306 |
| chi2 | 387.308792 |
| chi2_p | 0.0 |
| fisher_p | 0.0 |
| phi | 0.457554 |
| min_real | 110.0 |
| min_expected | 85.06973 |
Geschichtsmarker¶
In [52]:
meta_plot = ts[[
'marker_person_share_smoothed',
'marker_zeit_share_smoothed',
'marker_ort_share_smoothed',
'marker_objekt_share_smoothed',
]]
meta_plot.columns = [
'Person',
'Zeit',
'Ort',
'Objekt',
]
# save_ts_data(meta_plot, prefix='06_09_Geschichtsmarker_einzeln_')
fig = create_ts_plot(
data = meta_plot,
columns = meta_plot.columns,
y_axis_title = 'Anteil an Texten',
add_corporas = sub_df, add_corpora_names = sub_names,
add_corpora_categories = ['marker_person', 'marker_zeit', 'marker_ort', 'marker_objekt',]
)
fig = update_fig_for_publication(fig)
fig.write_image(f"plots/6.9 Geschichtsmarker-Typen im Zeitverlauf (einzeln).pdf")
fig.show()
In [53]:
meta_plot = ts[[
'marker_all_per_text_smoothed',
'marker_text_per_text_smoothed',
'marker_title_per_text_smoothed',
]]
meta_plot.columns = [
'Alle Marker',
'Marker im Text',
'Marker im Titel',
]
save_ts_data(meta_plot, prefix='06_09_Geschichtsmarker_gesamt_')
fig = create_ts_plot(
data = meta_plot,
columns = meta_plot.columns,
y_axis_title = 'Markertypen pro Text',
add_corporas = sub_df, add_corpora_names = sub_names,
add_corpora_categories = ['marker_all_per_text', 'marker_text_per_text', 'marker_title_per_text']
)
fig = update_fig_for_publication(fig)
fig.write_image(f"plots/6.9 Geschichtsmarker-Typen im Zeitverlauf (gesamt).pdf")
fig.show()
In [54]:
meta_plot = ts[[
'0marker_share_smoothed',
'1marker_share_smoothed',
'2marker_share_smoothed',
'3marker_share_smoothed',
'4marker_share_smoothed',
]]
meta_plot.columns = [
'0 Markertypen',
'1 Markertypen',
'2 Markertypen',
'3 Markertypen',
'4 Markertypen',
]
fig = create_ts_plot(
data = meta_plot,
columns = meta_plot.columns,
y_axis_title = 'Anteil an Texten',
add_corporas = sub_df, add_corpora_names = sub_names,
add_corpora_categories = ['0marker', '1marker', '2marker', '3marker', '4marker',]
)
fig.show()
In [55]:
main_feature = 'marker_count'
In [56]:
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = False).head(20)
Out[56]:
marker_count 1.000000 objektmarker_vorhanden 0.551354 ortmarker_vorhanden 0.537889 zeitmarker_vorhanden 0.510047 persmarker_vorhanden 0.442834 words 0.300117 zeitebenen 0.294403 nation_volk_d_positiv 0.230049 bekanntes_individuum_count 0.218128 kleinraum_count 0.182637 heroismus 0.168803 antike 0.167671 fixierbarkeit 0.145121 sprechakt_erzaehlen_vorhanden 0.137233 ereignis 0.135113 entity_count 0.134463 anachronismus 0.131773 mittelraum_count 0.124431 ueberlieferung 0.120582 geschichtsauffassung_positiv 0.113113 Name: marker_count, dtype: float64
In [57]:
meta_anth_bin.corr(numeric_only=True)[main_feature].sort_values(ascending = True).head(20)
Out[57]:
neuzeit -0.175863 liebe_negativ -0.175587 ende -0.174842 zeit_mitte -0.170904 beginn -0.163313 religion_positiv -0.157501 unbekanntes_individuum_count -0.131766 ueberlieferung_negativ -0.125123 tod_positiv -0.106239 year_predict_ages_mean -0.092459 ueberlieferung_positiv -0.090329 wissen_identisch -0.088696 lied -0.081683 year -0.080969 politik_negativ -0.073416 decade -0.071780 nogenre -0.071700 behandelt_deutschen_mittelraum -0.070754 sprechakt_behaupten_vorhanden -0.070608 tod_negativ -0.069347 Name: marker_count, dtype: float64
In [58]:
threshold = 0.15
bin_comp_features = get_features(meta_anth_bin.corr(numeric_only=True)[main_feature], threshold = threshold, mode = 'bin')
cont_comp_features = get_features(meta_anth_bin.corr(numeric_only=True)[main_feature], threshold = threshold, mode = 'cont')
In [59]:
results = relations_contbin(
meta = meta_anth_bin,
main_feature = main_feature,
comp_features = bin_comp_features
)
In [60]:
directly_related = [
'objektmarker_vorhanden', 'ortmarker_vorhanden', 'zeitmarker_vorhanden', 'persmarker_vorhanden',
]
results_filtered = (
results
.query("index not in @directly_related")
.query("mannwhitneyu_p < 0.05 and (pointbiserialr_corr >= @threshold or pointbiserialr_corr <= -@threshold)")
.sort_values(by = 'pointbiserialr_corr', ascending = False)
)
round(results_filtered, 2)
Out[60]:
| wenn marker_count = 0: Anteil Texte mit Feature = ... | wenn marker_count = 1: Anteil Texte mit Feature = ... | wenn marker_count = 2: Anteil Texte mit Feature = ... | wenn marker_count = 3: Anteil Texte mit Feature = ... | wenn marker_count > 3: Anteil Texte mit Feature = ... | pointbiserialr_corr | pointbiserialr_p | ttest_p | cohens_d | mannwhitneyu_stat | mannwhitneyu_p | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| heroismus | 0.14285714285714285 [7/49] | 0.1867612293144208 [79/423] | 0.266754270696452 [203/761] | 0.37037037037037035 [180/486] | 0.4122137404580153 [54/131] | 0.17 | 0.00 | 0.00 | -0.38 | 274934.0 | 0.00 |
| antike | 0.02040816326530612 [1/49] | 0.07328605200945626 [31/423] | 0.11038107752956636 [84/761] | 0.18930041152263374 [92/486] | 0.2595419847328244 [34/131] | 0.17 | 0.00 | 0.00 | -0.51 | 142243.0 | 0.00 |
| religion_positiv | 0.5 [1/2] | 0.6515151515151515 [43/66] | 0.46956521739130436 [54/115] | 0.417910447761194 [28/67] | 0.4117647058823529 [7/17] | -0.16 | 0.01 | 0.01 | 0.32 | 10512.0 | 0.01 |
| neuzeit | 0.7755102040816326 [38/49] | 0.6572104018912529 [278/423] | 0.5374507227332457 [409/761] | 0.46502057613168724 [226/486] | 0.3816793893129771 [50/131] | -0.18 | 0.00 | 0.00 | 0.36 | 506026.0 | 0.00 |
In [61]:
results_other = results.query("index not in @results_filtered.index")
round(results_other.sort_values(by='pointbiserialr_corr', ascending=False), 2)
Out[61]:
| wenn marker_count = 0: Anteil Texte mit Feature = ... | wenn marker_count = 1: Anteil Texte mit Feature = ... | wenn marker_count = 2: Anteil Texte mit Feature = ... | wenn marker_count = 3: Anteil Texte mit Feature = ... | wenn marker_count > 3: Anteil Texte mit Feature = ... | pointbiserialr_corr | pointbiserialr_p | ttest_p | cohens_d | mannwhitneyu_stat | mannwhitneyu_p | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| objektmarker_vorhanden | 0.0 [0/49] | 0.18912529550827423 [80/423] | 0.6254927726675427 [476/761] | 0.8621399176954733 [419/486] | 1.0 [131/131] | 0.55 | 0.00 | 0.00 | -1.36 | 155638.5 | 0.00 |
| ortmarker_vorhanden | 0.0 [0/49] | 0.054373522458628844 [23/423] | 0.15768725361366623 [120/761] | 0.4897119341563786 [238/486] | 1.0 [131/131] | 0.54 | 0.00 | 0.00 | -1.41 | 122779.0 | 0.00 |
| zeitmarker_vorhanden | 0.0 [0/49] | 0.13947990543735225 [59/423] | 0.3455978975032852 [263/761] | 0.6707818930041153 [326/486] | 1.0 [131/131] | 0.51 | 0.00 | 0.00 | -1.19 | 183207.0 | 0.00 |
| persmarker_vorhanden | 0.0 [0/49] | 0.6170212765957447 [261/423] | 0.871222076215506 [663/761] | 0.977366255144033 [475/486] | 1.0 [131/131] | 0.44 | 0.00 | 0.00 | -1.36 | 91982.5 | 0.00 |
| nation_volk_d_positiv | 0.0 [0/2] | 0.7857142857142857 [11/14] | 0.6470588235294118 [11/17] | 0.7647058823529411 [13/17] | 1.0 [6/6] | 0.23 | 0.09 | 0.09 | -0.54 | 229.5 | 0.14 |
| bekanntes_individuum_negativ | 0.0 [0/13] | 0.10984848484848485 [29/264] | 0.2048611111111111 [118/576] | 0.2074074074074074 [84/405] | 0.21100917431192662 [23/109] | 0.08 | 0.00 | 0.00 | -0.22 | 125235.5 | 0.00 |
| entity_positiv | 0.5306122448979592 [26/49] | 0.46335697399527187 [196/423] | 0.4533508541392904 [345/761] | 0.42386831275720166 [206/486] | 0.40458015267175573 [53/131] | 0.07 | 0.00 | 0.89 | 0.01 | 225716.0 | 0.93 |
| entity_negativ | 0.14285714285714285 [7/49] | 0.2293144208037825 [97/423] | 0.28252299605781866 [215/761] | 0.26954732510288065 [131/486] | 0.25190839694656486 [33/131] | 0.07 | 0.00 | 0.06 | -0.10 | 294935.5 | 0.07 |
| entity_ambivalent | 0.04081632653061224 [2/49] | 0.08983451536643026 [38/423] | 0.11432325886990802 [87/761] | 0.09670781893004116 [47/486] | 0.13740458015267176 [18/131] | 0.06 | 0.01 | 0.13 | -0.12 | 148196.0 | 0.19 |
| stoffgebiet_neutral | 0.08163265306122448 [4/49] | 0.17494089834515367 [74/423] | 0.18528252299605782 [141/761] | 0.16666666666666666 [81/486] | 0.183206106870229 [24/131] | 0.04 | 0.06 | 0.51 | -0.04 | 230964.5 | 0.63 |
| stoffgebiet_ambivalent | 0.14285714285714285 [7/49] | 0.15839243498817968 [67/423] | 0.15768725361366623 [120/761] | 0.15432098765432098 [75/486] | 0.22137404580152673 [29/131] | 0.04 | 0.11 | 0.27 | -0.07 | 221141.5 | 0.37 |
| kollektiv_positiv | 0.375 [9/24] | 0.4419889502762431 [80/181] | 0.41590214067278286 [136/327] | 0.44933920704845814 [102/227] | 0.4857142857142857 [34/70] | 0.03 | 0.38 | 0.38 | -0.06 | 81887.5 | 0.43 |
| stoffgebiet_positiv | 0.4489795918367347 [22/49] | 0.425531914893617 [180/423] | 0.46254927726675427 [352/761] | 0.4382716049382716 [213/486] | 0.366412213740458 [48/131] | 0.02 | 0.35 | 0.76 | 0.02 | 349197.5 | 0.86 |
| entity_neutral | 0.42857142857142855 [21/49] | 0.29550827423167847 [125/423] | 0.266754270696452 [203/761] | 0.27983539094650206 [136/486] | 0.25190839694656486 [33/131] | -0.01 | 0.81 | 0.11 | 0.08 | 282883.0 | 0.18 |
| kollektiv_negativ | 0.25 [6/24] | 0.32044198895027626 [58/181] | 0.3302752293577982 [108/327] | 0.29955947136563876 [68/227] | 0.2857142857142857 [20/70] | -0.01 | 0.68 | 0.68 | 0.03 | 75455.0 | 0.63 |
| unbekanntes_individuum_negativ | 0.045454545454545456 [1/22] | 0.232 [29/125] | 0.15083798882681565 [27/179] | 0.16483516483516483 [15/91] | 0.17647058823529413 [3/17] | -0.02 | 0.74 | 0.74 | 0.04 | 14041.5 | 0.54 |
| stoffgebiet_negativ | 0.32653061224489793 [16/49] | 0.26004728132387706 [110/423] | 0.22601839684625494 [172/761] | 0.19958847736625515 [97/486] | 0.183206106870229 [24/131] | -0.02 | 0.45 | 0.01 | 0.15 | 310503.0 | 0.01 |
| bekanntes_individuum_positiv | 0.6153846153846154 [8/13] | 0.696969696969697 [184/264] | 0.7065972222222222 [407/576] | 0.6765432098765433 [274/405] | 0.6422018348623854 [70/109] | -0.03 | 0.30 | 0.30 | 0.06 | 206687.5 | 0.29 |
| unbekanntes_individuum_positiv | 0.5454545454545454 [12/22] | 0.424 [53/125] | 0.39106145251396646 [70/179] | 0.46153846153846156 [42/91] | 0.29411764705882354 [5/17] | -0.03 | 0.49 | 0.49 | 0.07 | 23530.0 | 0.62 |
| liebe_negativ | 0.0 [0/1] | 0.3 [3/10] | 0.08108108108108109 [3/37] | 0.09090909090909091 [1/11] | 0.0 [0/4] | -0.18 | 0.17 | 0.17 | 0.60 | 256.5 | 0.14 |
In [62]:
result_categories = ['pointbiserialr_corr', 'mannwhitneyu_p']
results_a = relations_contbin(
meta = meta_anth_bin.query("1850 <= year <= 1884"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_b = relations_contbin(
meta = meta_anth_bin.query("1885 <= year <= 1918"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_merged = results_a[result_categories].join(
results_b[result_categories],
lsuffix='_1850', rsuffix = '_1885'
)
results_merged['diff_of_corrs'] = results_merged['pointbiserialr_corr_1885'] - results_merged['pointbiserialr_corr_1850']
round(results_merged.sort_values(by = 'diff_of_corrs'), 3)
Out[62]:
| pointbiserialr_corr_1850 | mannwhitneyu_p_1850 | pointbiserialr_corr_1885 | mannwhitneyu_p_1885 | diff_of_corrs | |
|---|---|---|---|---|---|
| religion_positiv | -0.133 | 0.032 | -0.256 | 0.076 | -0.123 |
| antike | 0.195 | 0.000 | 0.085 | 0.090 | -0.111 |
| heroismus | 0.182 | 0.000 | 0.104 | 0.024 | -0.078 |
| neuzeit | -0.162 | 0.000 | -0.179 | 0.000 | -0.017 |
In [63]:
results = relations_contbin_ratings(meta_anth_bin, main_feature)
results.sort_values(by = 'pointbiserialr_corr')
0%| | 0/14 [00:00<?, ?it/s]
Out[63]:
| wenn marker_count = 1: Anteil mit Feature = ... | wenn marker_count = 2: Anteil mit Feature = ... | wenn marker_count = 3: Anteil mit Feature = ... | wenn marker_count = 4: Anteil mit Feature = ... | pointbiserialr_corr | pointbiserialr_p | mannwhitneyu_stat | mannwhitneyu_p | |
|---|---|---|---|---|---|---|---|---|
| entity_neutral | 0.3234 [271/838] | 0.2741 [440/1605] | 0.2943 [319/1084] | 0.2805 [85/303] | -0.040430 | 0.011399 | 1528628.0 | 0.027589 |
| bekanntes_individuum_positiv | 0.5917 [213/360] | 0.5807 [500/861] | 0.548 [337/615] | 0.5345 [93/174] | -0.038488 | 0.083357 | 481194.0 | 0.077973 |
| stoffgebiet_negativ | 0.2234 [126/564] | 0.2058 [214/1040] | 0.2171 [152/700] | 0.1547 [28/181] | -0.033441 | 0.091989 | 516954.0 | 0.159493 |
| unbekanntes_individuum_positiv | 0.3403 [65/191] | 0.3425 [87/254] | 0.3594 [46/128] | 0.2083 [5/24] | -0.030496 | 0.444446 | 43808.0 | 0.590164 |
| kollektiv_negativ | 0.262 [60/229] | 0.2642 [112/424] | 0.2386 [68/285] | 0.2366 [22/93] | -0.009726 | 0.751324 | 104848.0 | 0.661277 |
| stoffgebiet_positiv | 0.4805 [271/564] | 0.4625 [481/1040] | 0.4671 [327/700] | 0.4475 [81/181] | -0.009321 | 0.638676 | 794270.5 | 0.627611 |
| unbekanntes_individuum_negativ | 0.1571 [30/191] | 0.126 [32/254] | 0.1172 [15/128] | 0.125 [3/24] | -0.001769 | 0.964622 | 21887.5 | 0.789823 |
| entity_positiv | 0.4773 [400/838] | 0.4822 [774/1605] | 0.4825 [523/1084] | 0.4587 [139/303] | 0.003371 | 0.832993 | 1918894.0 | 0.862723 |
| kollektiv_positiv | 0.3843 [88/229] | 0.3679 [156/424] | 0.3789 [108/285] | 0.3763 [35/93] | 0.013102 | 0.669469 | 133755.0 | 0.746670 |
| stoffgebiet_ambivalent | 0.1294 [73/564] | 0.1308 [136/1040] | 0.1271 [89/700] | 0.1823 [33/181] | 0.020629 | 0.298692 | 382210.0 | 0.397510 |
| entity_negativ | 0.148 [124/838] | 0.1763 [283/1605] | 0.1633 [177/1084] | 0.1749 [53/303] | 0.024670 | 0.122696 | 1087238.0 | 0.176100 |
| stoffgebiet_neutral | 0.1667 [94/564] | 0.201 [209/1040] | 0.1886 [132/700] | 0.2155 [39/181] | 0.028788 | 0.146926 | 514505.5 | 0.177607 |
| entity_ambivalent | 0.0513 [43/838] | 0.0673 [108/1605] | 0.06 [65/1084] | 0.0858 [26/303] | 0.031513 | 0.048621 | 475987.0 | 0.084288 |
| bekanntes_individuum_negativ | 0.0861 [31/360] | 0.1498 [129/861] | 0.1463 [90/615] | 0.1494 [26/174] | 0.053802 | 0.015464 | 261169.5 | 0.019958 |
In [64]:
results = relations_contcont(
meta = meta_anth_bin,
main_feature = main_feature,
comp_features = cont_comp_features
)
In [65]:
directly_related = [
'beginn', 'ende', # related to zeit_mitte
]
results_filtered = (
results
.query("index not in @directly_related")
.query("pearsonr_p < 0.05 and (pearsonr_corr >= @threshold or pearsonr_corr <= -@threshold)")
.sort_values(by = 'pearsonr_corr', ascending = False)
)
round(results_filtered, 2)
Out[65]:
| wenn marker_count = 0: Mittelwert Feature = ... | wenn marker_count = 1: Mittelwert Feature = ... | wenn marker_count = 2: Mittelwert Feature = ... | wenn marker_count = 3: Mittelwert Feature = ... | wenn marker_count > 3: Mittelwert Feature = ... | pearsonr_corr | pearsonr_p | |
|---|---|---|---|---|---|---|---|
| words | 208.34 | 223.73 | 307.93 | 361.20 | 470.13 | 0.30 | 0.0 |
| zeitebenen | 1.59 | 1.74 | 1.96 | 2.25 | 2.63 | 0.29 | 0.0 |
| bekanntes_individuum_count | 0.31 | 0.85 | 1.13 | 1.27 | 1.33 | 0.22 | 0.0 |
| kleinraum_count | 0.41 | 0.52 | 0.66 | 0.76 | 0.84 | 0.18 | 0.0 |
| zeit_mitte | 1667.03 | 1436.41 | 1299.61 | 1200.74 | 1072.13 | -0.17 | 0.0 |
In [66]:
results_other = results.query("index not in @results_filtered.index")
round(results_other.sort_values(by='pearsonr_corr', ascending=False), 2)
Out[66]:
| wenn marker_count = 0: Mittelwert Feature = ... | wenn marker_count = 1: Mittelwert Feature = ... | wenn marker_count = 2: Mittelwert Feature = ... | wenn marker_count = 3: Mittelwert Feature = ... | wenn marker_count > 3: Mittelwert Feature = ... | pearsonr_corr | pearsonr_p | |
|---|---|---|---|---|---|---|---|
| beginn | 1660.52 | 1423.46 | 1282.06 | 1194.55 | 1059.44 | -0.16 | 0.0 |
| ende | 1673.54 | 1449.36 | 1317.15 | 1206.93 | 1084.81 | -0.17 | 0.0 |
In [67]:
meta_plot = meta_anth_bin.copy()
meta_plot = meta_plot.sort_values(by='zeitebenen')
meta_plot['words'] = meta_plot['words'].clip(upper=1250)
meta_plot['zeit_mitte'] = meta_plot['zeit_mitte'].clip(lower=0)
for cont_comp_feature in results_filtered.index:
fig = px.box(
meta_plot,
x = main_feature,
y = cont_comp_feature,
labels = {'zeitebenen' : 'Anzahl Zeitebenen',
'marker_count' : 'Anzahl Geschichtsmarker-Typen',
'zeit_mitte' : 'Mitte der dominanten Zeitebene',
'words' : 'Anzahl Wörter',
'kleinraum_count' : 'Anzahl behandelte Kleinräume',
'bekanntes_individuum_count' : 'Anzahl behandelte<br>bekannte Individuen'
},
color_discrete_sequence=['grey']
)
if cont_comp_feature != 'words' and cont_comp_feature != 'zeit_mitte':
fig.update_traces(boxmean=True)
fig.update_layout(
width = 700, height = 300,
xaxis=dict(tickfont=dict(size=16), titlefont=dict(size=16)),
yaxis=dict(tickfont=dict(size=16), titlefont=dict(size=16)),
legend=dict(font = dict(size=16), x=0.61, y = 0.88),
bargap=0.1
)
fig = update_fig_for_publication(fig, make_grey=True)
fig.write_image(f"plots/6.9 Geschichtsmarker – {cont_comp_feature}.pdf")
fig.show()
In [68]:
result_categories = ['pearsonr_corr', 'pearsonr_p']
results_a = relations_contcont(
meta = meta_anth_bin.query("1850 <= year <= 1884"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_b = relations_contcont(
meta = meta_anth_bin.query("1885 <= year <= 1918"),
main_feature = main_feature,
comp_features = results_filtered.index
)
results_merged = results_a[result_categories].join(
results_b[result_categories],
lsuffix='_1850', rsuffix = '_1885'
)
results_merged['diff_of_corrs'] = results_merged['pearsonr_corr_1885'].abs() - results_merged['pearsonr_corr_1850'].abs()
round(results_merged.sort_values(by = 'diff_of_corrs'), 3)
Out[68]:
| pearsonr_corr_1850 | pearsonr_p_1850 | pearsonr_corr_1885 | pearsonr_p_1885 | diff_of_corrs | |
|---|---|---|---|---|---|
| zeit_mitte | -0.197 | 0.0 | -0.093 | 0.037 | -0.104 |
| words | 0.299 | 0.0 | 0.268 | 0.000 | -0.031 |
| kleinraum_count | 0.179 | 0.0 | 0.160 | 0.000 | -0.019 |
| zeitebenen | 0.299 | 0.0 | 0.285 | 0.000 | -0.014 |
| bekanntes_individuum_count | 0.190 | 0.0 | 0.254 | 0.000 | 0.064 |
Textlänge¶
In [69]:
meta_anth_bin['period'] = [1 if year >= 1885 else 0 for year in meta_anth_bin['year']]
for interval in [(0, 9999999), (0, 200), (201, 400), (401, 9999999)]:
interval_start = interval[0]
interval_stop = interval[1]
meta_size = meta_anth_bin.query("@interval_start <= words <= @interval_stop")
results = relations_bincont(
meta = meta_size,
main_feature = 'period',
comp_features = ['marker_count']
)
# wenn_nicht = 1850–1884, wenn_ja = 1885–1918
print(f"Texte mit {interval_start}–{interval_stop} Wörtern")
print(f"1850–1884 : {meta_size.query('period == 0').shape[0]}")
print(f"1885–1918 : {meta_size.query('period == 1').shape[0]}")
print(results[['wenn_nicht', 'wenn_ja', 'mannwhitneyu_stat', 'mannwhitneyu_p', 'pointbiserialr_corr']].T)
print("\n")
Texte mit 0–9999999 Wörtern
1850–1884 : 1221
1885–1918 : 502
marker_count
wenn_nicht 2.211302
wenn_ja 1.972112
mannwhitneyu_stat 348270.500000
mannwhitneyu_p 0.000003
pointbiserialr_corr -0.117290
Texte mit 0–200 Wörtern
1850–1884 : 368
1885–1918 : 212
marker_count
wenn_nicht 1.864130
wenn_ja 1.698113
mannwhitneyu_stat 42849.000000
mannwhitneyu_p 0.037070
pointbiserialr_corr -0.087557
Texte mit 201–400 Wörtern
1850–1884 : 523
1885–1918 : 198
marker_count
wenn_nicht 2.216061
wenn_ja 2.121212
mannwhitneyu_stat 54524.500000
mannwhitneyu_p 0.238186
pointbiserialr_corr -0.049897
Texte mit 401–9999999 Wörtern
1850–1884 : 330
1885–1918 : 92
marker_count
wenn_nicht 2.590909
wenn_ja 2.282609
mannwhitneyu_stat 17778.000000
mannwhitneyu_p 0.007912
pointbiserialr_corr -0.140326
Einzelne Markertypen¶
In [70]:
for marker_type in ['persmarker_vorhanden', 'zeitmarker_vorhanden',
'ortmarker_vorhanden', 'objektmarker_vorhanden']:
results = relations_bincont(
meta = meta_anth_bin,
main_feature = marker_type,
comp_features = ['zeit_mitte']
)
print(marker_type)
print(round(results[['wenn_nicht', 'wenn_ja', 'pointbiserialr_corr',
'mannwhitneyu_stat', 'mannwhitneyu_p']].T, 2))
print("\n")
persmarker_vorhanden
zeit_mitte
wenn_nicht 1489.35
wenn_ja 1258.76
pointbiserialr_corr -0.13
mannwhitneyu_stat 302450.00
mannwhitneyu_p 0.00
zeitmarker_vorhanden
zeit_mitte
wenn_nicht 1253.09
wenn_ja 1359.84
pointbiserialr_corr 0.08
mannwhitneyu_stat 340009.50
mannwhitneyu_p 0.00
ortmarker_vorhanden
zeit_mitte
wenn_nicht 1341.97
wenn_ja 1183.58
pointbiserialr_corr -0.10
mannwhitneyu_stat 346646.00
mannwhitneyu_p 0.55
objektmarker_vorhanden
zeit_mitte
wenn_nicht 1472.40
wenn_ja 1181.06
pointbiserialr_corr -0.21
mannwhitneyu_stat 564747.50
mannwhitneyu_p 0.00
In [71]:
results = relations_binbin(
meta = meta_anth_bin,
main_feature = 'heroismus',
comp_features = [
'persmarker_vorhanden', 'zeitmarker_vorhanden',
'ortmarker_vorhanden', 'objektmarker_vorhanden'
]
)
round(results, 2)
Out[71]:
| wenn_nicht | wenn_nicht_detail | wenn_ja | wenn_ja_detail | diff_low_bootstrap | diff_low | diff | diff_high | diff_high_bootstrap | chi2 | chi2_p | fisher_p | phi | min_real | min_expected | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| persmarker_vorhanden | 0.80 | 1068/1327 | 0.88 | 462/523 | 0.04 | 0.04 | 0.08 | 0.11 | 0.11 | 16.18 | 0.00 | 0.00 | 0.09 | 61.0 | 90.46 |
| zeitmarker_vorhanden | 0.41 | 542/1327 | 0.45 | 237/523 | -0.01 | -0.01 | 0.04 | 0.09 | 0.10 | 3.08 | 0.08 | 0.08 | 0.04 | 237.0 | 220.23 |
| ortmarker_vorhanden | 0.23 | 306/1327 | 0.39 | 206/523 | 0.12 | 0.12 | 0.16 | 0.21 | 0.21 | 49.97 | 0.00 | 0.00 | 0.16 | 206.0 | 144.74 |
| objektmarker_vorhanden | 0.58 | 770/1327 | 0.64 | 336/523 | 0.01 | 0.01 | 0.06 | 0.11 | 0.11 | 6.03 | 0.01 | 0.02 | 0.06 | 187.0 | 210.33 |
Korpora¶
In [72]:
meta_anth_early = meta_anth_bin.query("1850<=year<=1884")
meta_muench_bin = binarize_meta(meta_muench)
mannwhitneyu(meta_anth_early['marker_count'], meta_muench_bin['marker_count'])
Out[72]:
MannwhitneyuResult(statistic=109242.5, pvalue=0.001039161435277055)